*************************************************
* Purpose -- Preparing different samples for 
* analysis using QCEW data QUARTERLY
************************************************
clear all
* RUN 00_path_master.do FIRST TO GET FILEPATHS


use "$clean/qcew/qcew_quarterly_clean_county.dta", clear
sort countyfip yq, stable
xtset countyfips yq

************************************************
* CZ-state dataset
************************************************

frame copy default cz_state
frame change cz_state

sum pop if yq == 236, d
local county_pop_total = `r(sum)' //this is to get proportion of total pop covered

* keeping only balanced sample
keep if balanced_full_75 == 1

sum pop if yq == 236, d
local county_pop_balanced = `r(sum)'

* calculating share of population kept
global county_share = (`county_pop_balanced'/`county_pop_total')*100
global county_share: disp %4.2f $county_share

* calculating share of counties kept
distinct countyfips
global county_num_balanced = `r(ndistinct)'
global county_coverage = (${county_num_balanced}/3109)*100
global county_coverage: disp %3.2f $county_coverage

egen czstate = group(czone statefips)

* collapsing dataset to cz-state-quarter level
collapse (mean) mw fed_mw index czone statefips (sum) qtrly_estabs70 qtrly_avg_emplvl70 ///
total_qtrly_wages70 qtrly_estabs75 qtrly_avg_emplvl75 total_qtrly_wages75 pop ///
(first) statename year, by(czstate yq)

* creating basic variables
gen lemp75 = ln(qtrly_avg_emplvl75)
gen lmw = ln(mw)
gen lpop = ln(pop)

gen avg_earnings = (total_qtrly_wages75/qtrly_avg_emplvl75)/13.035 // average weekly earnings per worker
gen real_earnings = avg_earnings/index
gen learn75 = ln(real_earnings)

gen lemp_other = ln(qtrly_avg_emplvl70 - qtrly_avg_emplvl75)

gen avg_earnings_other = ((total_qtrly_wages70 - total_qtrly_wages75)/(qtrly_avg_emplvl70 - qtrly_avg_emplvl75))/13.035
gen real_earnings_other = avg_earnings_other/index
gen learn_other = ln(real_earnings_other)

xtset czstate yq
tempfile czstate_yq
save `czstate_yq'

save "$clean/qcew/cz_state_sample_qtr.dta", replace

* creating another CZ-state dataset with all counties and only population data to be used later
frame copy default cz_state_pop
frame change cz_state_pop

egen czstate = group(czone statefips)

collapse (mean) czone statefips (sum) pop (first) statename year, by(czstate yq)

tempfile czstate_yq_pop
save `czstate_yq_pop'


************************************************
* MSCZ pairs dataset
************************************************
frame create mscz
frame change mscz

* using the next few lines of code to just get the pairs as per JNR
use "$raw/jnr-cbp/cbp_stacked_czonestatepair_sample.dta", clear

rename (czonestatepair state) (pair_id statefips)
egen pair_id_czstate = group(pair_id czonestate) 
keep pair_id pair_id_czstate czone statefips year

* creating "observations" for quarters for merging
expand 4
bysort pair_id czone statefips pair_id_czstate year: gen qtr = _n

gen yq = qofd(dofq(yq(year, qtr)))
format yq %tq

* creating "observations" for quarters in 2017-2019 for merging
forvalues i = 228/239 {
	insobs 1
	replace yq = `i' if _n == _N
}

fillin pair_id_czstate yq
drop if mi(pair_id_czstate)
drop _fillin

drop year qtr

* some years were missing in CBP data, and also pair_id and county will be missing for 1990-2019 data
* just copying these variables within a pair_id_county

foreach var in pair_id czone statefips {
	bys pair_id_czstate (`var'): replace `var' = `var'[1] if mi(`var')
}


foreach var of varlist _all {
	assert ~mi(`var')
}

* next few lines of code (within the two preserve-restores are used to calculate
* coverage in terms of pairs and in terms of population for this sample)
preserve
	merge m:1 czone state yq using `czstate_yq_pop'
	keep if _merge == 3
	drop _merge
	
	duplicates drop czone statefips yq, force
	sum pop if yq == 236, d
	local cz_pop_total = `r(sum)'
restore

preserve
	merge m:1 czone state yq using `czstate_yq'
	keep if _merge == 3
	drop _merge
	
	bys pair_id: gen obs = _N
	keep if obs == 240
	distinct pair_id
	global cz_num_balanced = `r(ndistinct)'
	global cz_coverage = (${cz_num_balanced}/151)*100
	global cz_coverage: disp %4.2f $cz_coverage
	duplicates drop czone statefips yq, force
	sum pop if yq == 236, d
	local cz_pop_balanced = `r(sum)'
	global cz_share = (`cz_pop_balanced'/`cz_pop_total')*100
	global cz_share: disp %4.2f $cz_share
	
restore
	
* now merging JNR pairs with our CZ_state dataset	
merge m:1 czone state yq using `czstate_yq'

* only keeping CZ_state that are part of MSCZ pairs
keep if _merge == 3
drop _merge

* dropping pair_id_czstate for whom we do not have data for the other pair (mostly because of balanced data)
bys pair_id: gen obs = _N
keep if obs == 240

xtset pair_id_czstate yq
egen pair_id_num = group(pair_id)

save "$clean/qcew/mscz_sample_qtr.dta", replace


************************************************
* Border county pairs dataset
************************************************
* starting from county-level dataset instead of CZ-state
frame copy default bcp1
frame change bcp1

* creating variables like before
gen lemp75 = ln(qtrly_avg_emplvl75)
gen lmw = ln(mw)
gen lpop = ln(pop)

gen avg_earnings = (total_qtrly_wages75/qtrly_avg_emplvl75)/13.035
gen real_earnings = avg_earnings/index
gen learn75 = ln(real_earnings)

gen lemp_other = ln(qtrly_avg_emplvl70 - qtrly_avg_emplvl75)

gen avg_earnings_other = ((total_qtrly_wages70 - total_qtrly_wages75)/(qtrly_avg_emplvl70 - qtrly_avg_emplvl75))/13.035
gen real_earnings_other = avg_earnings_other/index
gen learn_other = ln(real_earnings_other)

tempfile county_yq
save `county_yq'

* again getting county pairs from JNR's replication files
use "$raw/jnr-cbp/cbp_stacked_countypair_sample.dta", clear

* Keep only border counties
	merge m:1 countypair_id county using "$raw/jnr-cbp/countypairtypes.dta", nogen assert(3)
	keep if type0 == 1
drop type*

rename (countypair_id county) (pair_id countyfips)
egen pair_id_county = group(pair_id countyfips) 
keep pair_id pair_id_county countyfips year

* creating "observations" for quarters for merging
expand 4
bysort pair_id pair_id_county countyfips year: gen qtr = _n

gen yq = qofd(dofq(yq(year, qtr)))
format yq %tq

* creating "observations" for quarters in 2017-2019 for merging
forvalues i = 228/239 {
	insobs 1
	replace yq = `i' if _n == _N
}

fillin pair_id_county yq
drop if mi(pair_id_county)
drop _fillin

drop year qtr

* some years were missing in BCP data, and also pair_id and county will be missing for 1990-2019 data
* just copying these variables within a pair_id_county

bys pair_id_county (pair_id): replace pair_id = pair_id[_N] if mi(pair_id)
bys pair_id_county (countyfips): replace countyfips = countyfips[1] if mi(countyfips)

foreach var of varlist _all {
	assert ~mi(`var')
}

tempfile bcp
save `bcp'

*now merging JNR pairs with our county dataset
merge m:1 countyfips yq using `county_yq'

keep if _merge == 3
drop _merge

preserve
	duplicates drop countyfips yq, force
	sum pop if yq == 236, d
	local bcp_pop_total = `r(sum)'
restore

keep if balanced_full_75 == 1

* dropping counties for whom we do not have data for the other pair due to balanced pair
bys pair_id: gen obs = _N
keep if obs == 240

distinct pair_id
global bcp_num_balanced = `r(ndistinct)'
global bcp_coverage = (${bcp_num_balanced}/1165)*100
global bcp_coverage: disp %4.2f $bcp_coverage

preserve
	duplicates drop countyfips year, force
	sum pop if yq == 236, d
	local bcp_pop_balanced = `r(sum)'
	global bcp_share = (`bcp_pop_balanced'/`bcp_pop_total')*100
	global bcp_share: disp %4.2f $bcp_share
restore

xtset pair_id_county yq
egen pair_id_num = group(pair_id)

save "$clean/qcew/bcp_sample_qtr.dta", replace


*************************************************
* Create annual data
************************************************
clear all

use "$clean/qcew/qcew_quarterly_annual_clean_county.dta", clear
sort countyfip year, stable
xtset countyfips year

************************************************
* CZ-state dataset
************************************************

frame copy default cz_state
frame change cz_state

sum pop if year == 2019, d
local county_pop_total = `r(sum)' //this is to get proportion of total pop covered

* keeping only balanced sample
keep if balanced_full_75 == 1

sum pop if year == 2019, d
local county_pop_balanced = `r(sum)'

* calculating share of population kept
global county_share = (`county_pop_balanced'/`county_pop_total')*100
global county_share: disp %4.2f $county_share

* calculating share of counties kept
distinct countyfips
global county_num_balanced = `r(ndistinct)'
global county_coverage = (${county_num_balanced}/3109)*100
global county_coverage: disp %3.2f $county_coverage

egen czstate = group(czone statefips)

* collapsing dataset to cz-state-year level
collapse (mean) mw fed_mw index czone statefips (sum) annual_avg_estabs70 annual_avg_emplvl70 ///
total_annual_wages70 annual_avg_estabs75 annual_avg_emplvl75 total_annual_wages75 ///
pop (first) statename, by(czstate year)

* creating basic variables
gen lemp75 = ln(annual_avg_emplvl75)
gen lmw = ln(mw)
gen lpop = ln(pop)

gen avg_earnings = (total_annual_wages75/annual_avg_emplvl75)/52.14 // average weekly earnings per worker
gen real_earnings = avg_earnings/index
gen learn75 = ln(real_earnings)

gen lemp_other = ln(annual_avg_emplvl70 - annual_avg_emplvl75)

gen avg_earnings_other = ((total_annual_wages70 - total_annual_wages75)/(annual_avg_emplvl70 - annual_avg_emplvl75))/52.14
gen real_earnings_other = avg_earnings_other/index
gen learn_other = ln(real_earnings_other)

xtset czstate year
tempfile czstate_year
save `czstate_year'

save "$clean/qcew/cz_state_sample_qtr_annual.dta", replace


* creating another CZ-state dataset with all counties and only population data to be used later
frame copy default cz_state_pop
frame change cz_state_pop

egen czstate = group(czone statefips)

collapse (mean) czone statefips (sum) pop (first) statename, by(czstate year)

tempfile czstate_year_pop
save `czstate_year_pop'


************************************************
* MSCZ pairs dataset
************************************************
frame create mscz
frame change mscz

* using the next few lines of code to just get the pairs as per JNR
use "$raw/jnr-cbp/cbp_stacked_czonestatepair_sample.dta", clear

rename (czonestatepair state) (pair_id statefips)
egen pair_id_czstate = group(pair_id czonestate) 
sort pair_id_czstate year, stable
keep pair_id pair_id_czstate czone statefips year

* creating observations for 2017-2019 for merging
forvalues i = 2017/2019 {
	insobs 1
	replace year = `i' if _n == _N
}

fillin pair_id_czstate year
drop if mi(pair_id_czstate)
drop _fillin


* some years were missing in CBP data, and also pair_id and county will be missing for 1990-2019 data
* just copying these variables within a pair_id_county

foreach var in pair_id czone statefips {
	bys pair_id_czstate (`var'): replace `var' = `var'[1] if mi(`var')
}


foreach var of varlist _all {
	assert ~mi(`var')
}

* next few lines of code (within the two preserve-restores are used to calculate
* coverage in terms of pairs and in terms of population for this sample)
preserve
	merge m:1 czone state year using `czstate_year_pop'
	keep if _merge == 3
	drop _merge
	
	duplicates drop czone statefips year, force
	sum pop if year == 2019, d
	local cz_pop_total = `r(sum)'
	
restore

preserve
	merge m:1 czone state year using `czstate_year'
	keep if _merge == 3
	drop _merge
	
	bys pair_id: gen obs = _N
	keep if obs == 60
	distinct pair_id
	global cz_num_balanced = `r(ndistinct)'
	global cz_coverage = (${cz_num_balanced}/151)*100
	global cz_coverage: disp %4.2f $cz_coverage
	duplicates drop czone statefips year, force
	sum pop if year == 2019, d
	local cz_pop_balanced = `r(sum)'
	global cz_share = (`cz_pop_balanced'/`cz_pop_total')*100
	global cz_share: disp %4.2f $cz_share
	
restore
	
* now merging JNR pairs with our CZ_state dataset	
merge m:1 czone state year using `czstate_year'

* only keeping CZ_state that are part of MSCZ pairs
keep if _merge == 3
drop _merge

* dropping pair_id_czstate for whom we do not have data for the other pair (mostly because of balanced data)
bys pair_id: gen obs = _N
keep if obs == 60

xtset pair_id_czstate year
egen pair_id_num = group(pair_id)

save "$clean/qcew/mscz_sample_qtr_annual.dta", replace


************************************************
* Border county pairs dataset
************************************************
* starting from county-level dataset instead of CZ-state
frame copy default bcp1
frame change bcp1

* creating variables like before
gen lemp75 = ln(annual_avg_emplvl75)
gen lmw = ln(mw)
gen lpop = ln(pop)

gen avg_earnings = (total_annual_wages75/annual_avg_emplvl75)/52.14
gen real_earnings = avg_earnings/index
gen learn75 = ln(real_earnings)

gen lemp_other = ln(annual_avg_emplvl70 - annual_avg_emplvl75)

gen avg_earnings_other = ((total_annual_wages70 - total_annual_wages75)/(annual_avg_emplvl70 - annual_avg_emplvl75))/52.14
gen real_earnings_other = avg_earnings_other/index
gen learn_other = ln(real_earnings_other)

tempfile county_year
save `county_year'

* again getting county pairs from JNR's replication files
use "$raw/jnr-cbp/cbp_stacked_countypair_sample.dta", clear

* Keep only border counties
	merge m:1 countypair_id county using "$raw/jnr-cbp/countypairtypes.dta", nogen assert(3)
	keep if type0 == 1
drop type*

rename (countypair_id county) (pair_id countyfips)
egen pair_id_county = group(pair_id countyfips) 
sort pair_id_county year, stable
keep pair_id pair_id_county countyfips year

forvalues i = 2017/2019 {
	insobs 1
	replace year = `i' if _n == _N
}

fillin pair_id_county year
drop if mi(pair_id_county)
drop _fillin

* some years were missing in BCP data, and also pair_id and county will be missing for 1990-2019 data
* just copying these variables within a pair_id_county

bys pair_id_county (pair_id): replace pair_id = pair_id[_N] if mi(pair_id)
bys pair_id_county (countyfips): replace countyfips = countyfips[1] if mi(countyfips)

foreach var of varlist _all {
	assert ~mi(`var')
}

tempfile bcp
save `bcp'

*now merging JNR pairs with our county dataset
merge m:1 countyfips year using `county_year'

keep if _merge == 3
drop _merge

preserve
	duplicates drop countyfips year, force
	sum pop if year == 2019, d
	local bcp_pop_total = `r(sum)'
restore

keep if balanced_full_75 == 1

* dropping counties for whom we do not have data for the other pair due to balanced pair
bys pair_id: gen obs = _N
keep if obs == 60

distinct pair_id
global bcp_num_balanced = `r(ndistinct)'
global bcp_coverage = (${bcp_num_balanced}/1165)*100
global bcp_coverage: disp %4.2f $bcp_coverage

preserve
	duplicates drop countyfips year, force
	sum pop if year == 2019, d
	local bcp_pop_balanced = `r(sum)'
	global bcp_share = (`bcp_pop_balanced'/`bcp_pop_total')*100
	global bcp_share: disp %4.2f $bcp_share
restore

xtset pair_id_county year
egen pair_id_num = group(pair_id)

save "$clean/qcew/bcp_sample_qtr_annual.dta", replace




